Todo:

  • answer research question
rm(list=ls())

Load requisite packages and data

library(ezids)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyverse)
library (tidyr)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)

This project uses the State and County Housing Market Indicators dataset from the American Enterprise Institute Housing Center, found here. The variables are:

Original Variable Name New Variable Name Definition
State State state
County County_Name County
FIPS FIPS_County_Code 5-digit Federal Information Processing Series codes (first 2 digits indicate state, last 3 indicate sub-county entity)
Year Year Year when the data was collected
Tier Affordability Categorizes home sales into entry-level (<=80th percentile of FHA sales prices), move-up (all others), and all
Median.Sale.Price..in.Thousands. Median_Sale_Price_in_k Median sale price in thousands of USD per county
House.Price.Appreciation.since.2012 House_Price_Appreciation_since_2012_percent Cumulative home price appreciation since 2012
House.Price.Appreciation..Year.over.Year House_Price_Appreciation_yr_over_yr_percent Home price appreciation since the previous year
Months..Supply Months_Supply Number of months it would take for the inventory of existing homes for sale to be exhausted at the current sales pace
New.Construction.Share.of.Sales New_Constr_by_share_of_sales_percent Percent of sales comprising new construction
Mortgage.Default.Rate Mortgage_Default_Rate_percent AEI Mortgage Default Rate, a measure of how loans originating in a given month would perform under the same conditions as the 2007 financial crisis (<=7%: Low Risk; between 7.01% and 14%: Medium Risk; >14%: High Risk)
housing = read.csv("/Users/ilgazkuscu/Documents/GitHub/housing-price-vs-supply-2024/Data/state_county_data_download_2025.csv")
housing %>% slice_sample(n=5)
##   State         County  FIPS Year       Tier Median.Sale.Price..in.Thousands.
## 1    IA Webster County 19187 2024        all                            $126 
## 2    NH    Coos County 33007 2015 entrylevel                             $64 
## 3    GA   Butts County 13035 2016 entrylevel                            $100 
## 4    KY  Warren County 21227 2014     moveup                            $257 
## 5    WI   Adams County 55001 2017 entrylevel                             $70 
##   House.Price.Appreciation.since.2012 House.Price.Appreciation..Year.over.Year.
## 1                              85.40%                                     4.10%
## 2                               3.50%                                     5.80%
## 3                              31.80%                                     7.80%
## 4                               4.60%                                     2.90%
## 5                               9.50%                                     5.30%
##   Months..Supply New.Construction.Share.of.Sales Mortgage.Default.Rate
## 1            2.7                           1.60%                17.30%
## 2           12.3                           1.70%                13.30%
## 3            1.6                           7.10%                26.10%
## 4            7.9                          18.40%                10.40%
## 5            6.0                           0.90%

The data is limited to the year 2024 and cleaned of NA values, and the variables are renamed for clarity.

housing_2024 = housing %>% filter(housing$Year == 2024, housing$State != 'AA National') %>% na.omit %>%


#rename cols
rename(
  Median_Sale_Price_per_k = Median.Sale.Price..in.Thousands.,
  House_Price_Appreciation_yr_over_yr_percent = House.Price.Appreciation..Year.over.Year.,
  House_Price_Appreciation_since_2012_percent = House.Price.Appreciation.since.2012,
  Months_Supply = Months..Supply,
  New_Constr_by_share_of_sales_percent = New.Construction.Share.of.Sales,
  Mortgage_Default_Rate_percent = Mortgage.Default.Rate,
  County_Name = County,
  FIPS_County_Code = FIPS, 
  Affordability = Tier
)
head(housing_2024)
##   State                County_Name FIPS_County_Code Year Affordability
## 1    AK                   AA State                0 2024           all
## 2    AK                   AA State                0 2024    entrylevel
## 3    AK                   AA State                0 2024        moveup
## 7    AK Aleutians West Census Area             2016 2024           all
## 8    AK Aleutians West Census Area             2016 2024    entrylevel
## 9    AK Aleutians West Census Area             2016 2024        moveup
##   Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1                   $394                                       59.60%
## 2                   $314                                       59.90%
## 3                   $564                                       57.40%
## 7                                                                    
## 8                                                                    
## 9                                                                    
##   House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1                                       5.40%           3.2
## 2                                       5.60%           2.7
## 3                                       5.00%           4.2
## 7                                                       1.8
## 8                                                       2.8
## 9                                                       1.6
##   New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1                                6.30%                        12.50%
## 2                                4.60%                        13.50%
## 3                                9.50%                        10.20%
## 7                                                                   
## 8                                                                   
## 9

The typing of the variables is also corrected. Some require the symbols “$” and “%” to be removed beforehand, so that is also done.

# as factors
housing_2024$State = as.factor(housing_2024$State)
housing_2024$County_Name = as.factor(housing_2024$County_Name)
housing_2024$FIPS_County_Code = as.factor(housing_2024$FIPS_County_Code)
housing_2024$Affordability = as.factor(housing_2024$Affordability)

# remove prefixes '$' and '%' from values
housing_2024 = housing_2024 %>%
  mutate(Median_Sale_Price_per_k = gsub("\\$", "", Median_Sale_Price_per_k),
         House_Price_Appreciation_since_2012_percent =
           gsub("%","",House_Price_Appreciation_since_2012_percent),
         House_Price_Appreciation_yr_over_yr_percent =
           gsub("%","",House_Price_Appreciation_yr_over_yr_percent),
         New_Constr_by_share_of_sales_percent = gsub("%","",New_Constr_by_share_of_sales_percent),
         Mortgage_Default_Rate_percent = gsub("%","",Mortgage_Default_Rate_percent)
  )

# as num instead of chr
housing_2024$Median_Sale_Price_per_k = as.numeric(housing_2024$Median_Sale_Price_per_k)
housing_2024$House_Price_Appreciation_since_2012_percent =
  as.numeric(housing_2024$House_Price_Appreciation_since_2012_percent)
housing_2024$House_Price_Appreciation_yr_over_yr_percent =
  as.numeric(housing_2024$House_Price_Appreciation_yr_over_yr_percent)
housing_2024$New_Constr_by_share_of_sales_percent = 
  as.numeric(housing_2024$New_Constr_by_share_of_sales_percent)
housing_2024$Mortgage_Default_Rate_percent = as.numeric(housing_2024$Mortgage_Default_Rate_percent)

# For some reason is rounding the data in quite a weird way—inaccurately

# view data
housing_2024 %>% slice_sample(n=5)
##   State      County_Name FIPS_County_Code Year Affordability
## 1    MI  Mackinac County            26097 2024    entrylevel
## 2    MO    Ripley County            29181 2024           all
## 3    OH Muskingum County            39119 2024           all
## 4    AR   Calhoun County             5013 2024        moveup
## 5    AL   Cullman County             1043 2024        moveup
##   Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent
## 1                      NA                                          NA
## 2                     161                                         115
## 3                     167                                         114
## 4                     253                                          NA
## 5                     410                                          NA
##   House_Price_Appreciation_yr_over_yr_percent Months_Supply
## 1                                          NA           5.9
## 2                                         9.3           2.1
## 3                                         6.1           2.2
## 4                                          NA           5.3
## 5                                          NA           7.4
##   New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
## 1                                   NA                            NA
## 2                                  5.2                          17.2
## 3                                  2.6                          17.1
## 4                                  0.0                          16.9
## 5                                 27.4                          10.0
# def= 
#   {
#     Low: all sales below the 40th percentile of FHA sales prices
#     Low-medium: all sales at or below the 80th percentile of FHA sales prices
#     Medium-high: all sales at or below 125% of the GSE loan limit
#     High: all other sales
#     entry-level: low and low-medium price tiers
#     move-up: medium-high and high price tiers
#   }

Exploratory Data Analysis (EDA)

xkablesummary(housing_2024)
Table: Statistics summary.
State County_Name FIPS_County_Code Year Affordability Median_Sale_Price_per_k House_Price_Appreciation_since_2012_percent House_Price_Appreciation_yr_over_yr_percent Months_Supply New_Constr_by_share_of_sales_percent Mortgage_Default_Rate_percent
Min TX : 733 AA State : 153 0 : 153 Min. :2024 all :3097 Min. : 14 Min. : 3.7 Min. :-32.90 Min. : 0.00 Min. : 0.0 Min. : 0.2
Q1 GA : 480 Washington County: 87 1001 : 3 1st Qu.:2024 entrylevel:3094 1st Qu.:150 1st Qu.: 86.6 1st Qu.: 2.80 1st Qu.: 2.20 1st Qu.: 2.7 1st Qu.:10.2
Median KY : 363 Jefferson County : 74 1003 : 3 Median :2024 moveup :3047 Median :260 Median :105.2 Median : 5.70 Median : 3.20 Median : 6.9 Median :13.9
Mean MO : 337 Franklin County : 72 1005 : 3 Mean :2024 NA Mean :286 Mean :110.8 Mean : 5.87 Mean : 4.36 Mean : 10.7 Mean :14.2
Q3 VA : 326 Jackson County : 68 1007 : 3 3rd Qu.:2024 NA 3rd Qu.:390 3rd Qu.:129.7 3rd Qu.: 8.60 3rd Qu.: 5.20 3rd Qu.: 15.0 3rd Qu.:17.4
Max IL : 307 Lincoln County : 66 1009 : 3 Max. :2024 NA Max. :999 Max. :279.1 Max. : 89.20 Max. :24.00 Max. :100.0 Max. :36.0
NA (Other):6692 (Other) :8718 (Other):9070 NA NA NA’s :992 NA’s :2177 NA’s :2204 NA NA’s :859 NA’s :1963

boxplot of median sale price by state

ggplot(housing_2024, aes(x = reorder(State, -Median_Sale_Price_per_k, median), 
                         y = Median_Sale_Price_per_k)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  coord_flip() +
  labs(
    title = "Distribution of Median Sale Prices by State (2024)",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

There are too many to be particularly useful. You can see general trends, but I am going to run this with a smaller state sample.

boxplot of top and bottom 5 states by housing count

top_states <- housing_2024 %>%
  dplyr::count(State, sort = TRUE) %>%
  dplyr::slice_max(n, n = 5)

bottom_states <- housing_2024 %>%
  dplyr::count(State, sort = TRUE) %>%
  dplyr::slice_min(n, n = 5)

# merge top and bottom states
housing_compare <- housing_2024 %>%
  filter(State %in% c(top_states$State, bottom_states$State)) %>%
  mutate(StateGroup = case_when(
    State %in% top_states$State ~ "States with the Most Houses",
    State %in% bottom_states$State ~ "States with the Fewest Houses"
  ))%>%
  mutate(StateGroup = factor(StateGroup, levels = c(
    "States with the Most Houses",
    "States with the Fewest Houses"
  )))

#plot top bottom comparison ####
ggplot(housing_compare, aes(x = State, y = Median_Sale_Price_per_k, fill = StateGroup)) +
  geom_boxplot() +
  facet_wrap(~ StateGroup, scales = "free_x") +
  labs(
    title = "Median Sale Price in States with Most vs Least Housing Records",
    subtitle = "The Median Sale Price in States with a Larger Supply of Houses is Significantly Lower\nthan States with a Smaller Supply of Houses",
    x = "State",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("States with the Most Houses" = "skyblue", "States with the Fewest Houses" = "red"))

hist of median sale price

ggplot(housing_2024, aes(x = Median_Sale_Price_per_k)) +
  geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "$", suffix = "k")) +
  labs(
    title = "Distribution of Median Sale Prices (2024)",
    x = "Median Sale Price (in thousands)",
    y = "Count"
  ) +
  theme_minimal()

#needs a subtitle with commentary, just remembered all need a caption with the source named, and we can prob add other elements to this, because otherwise it seems to obvi to me

boxplot of sale price by affordability tier

ggplot(housing_2024, aes(x = Affordability, y = Median_Sale_Price_per_k, fill = Affordability)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Affordability Tier (2024)",
    x = "Affordability Tier",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

#no duh, there are more unpurchased expensive houses because people can't afford it 
#not sure how useful this is, but maybe as a starting baseline 

scatterplot of months supply vs median price

ggplot(housing_2024, aes(x = Months_Supply, y = Median_Sale_Price_per_k)) +
  geom_point(aes(color = Affordability), alpha = 0.7) +
  # geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Housing Supply vs Median Sale Price",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

#this is a good one, needs a commentary subtitle and some cleaning
# Not at all what I would have expected. I wonder why this is?

correlation heatmap of numeric variables

housing_numeric <- housing_2024 %>%
  select(where(is.numeric)) %>%
  drop_na()

cor_matrix <- cor(housing_numeric)

corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8, addCoef.col = "black", number.cex=0.5)

#interesting to see positive and inverse relationships between variables 
#did not scan for no relationships

compare color scale

# Create a vector of colors for top states (blue shades) and bottom states (red shades)
top_states_colors <- scales::seq_gradient_pal("lightblue", "navy")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"]))))
names(top_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Most Houses"])

bottom_states_colors <- scales::seq_gradient_pal("lightpink", "darkred")(seq(0, 1, length.out = length(unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"]))))
names(bottom_states_colors) <- unique(housing_compare$State[housing_compare$StateGroup == "States with the Fewest Houses"])

#last point in line for state label 
label_points <- housing_compare %>%
  group_by(State) %>%
  filter(Months_Supply == max(Months_Supply, na.rm = TRUE)) %>%
  ungroup()

# Combine into one color vector
state_colors <- c(top_states_colors, bottom_states_colors)

#all in gray with faceted compare in color with states labeled ####
ggplot() +
  geom_point(data = housing_2024, aes(x = Months_Supply, y = Median_Sale_Price_per_k),
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare,
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           size = 1, alpha = 0.8) +
  geom_point(data = housing_compare,
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             size = 2, alpha = 0.5) +
  geom_text_repel(data = label_points,
                  aes(x = Months_Supply, y = Median_Sale_Price_per_k, 
                      label = State, color=State),
                  size = 3.5, stroke=0.01, show.legend = FALSE) +
  facet_wrap(~ StateGroup) +
  scale_color_manual(values = state_colors) +
  labs(
    title = "Housing Supply vs Median Price by State with Grouped Colors",
    subtitle = "Some commentary here",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State"
  ) +
  theme_minimal()

#can't read state names, maybe find a way to make it stand out 

both together in gray with compare in color with states labeled

ggplot() +
  geom_point(data = housing_2024, 
             aes(x = Months_Supply, y = Median_Sale_Price_per_k), 
             color = "gray70", alpha = 0.3, size = 1) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare %>% filter(StateGroup == "States with the Most Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  # geom_path(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
  #           aes(x = Months_Supply, y = Median_Sale_Price_per_k, group = State, color = State),
  #           alpha = 0.8, size = 1) +
  geom_point(data = housing_compare %>% filter(StateGroup == "States with the Fewest Houses"),
             aes(x = Months_Supply, y = Median_Sale_Price_per_k, color = State),
             alpha = 0.5, size = 2) +
  scale_color_manual(values = state_colors) +
  labs(
    title = "Housing Supply vs Median Price: All Counties with Highlights",
    subtitle = "Gray points: All counties | Blue shades: States with Most Houses | Red shades: States with Fewest Houses",
    x = "Months of Supply",
    y = "Median Sale Price (in thousands)",
    color = "State"
  ) +
  theme_minimal()

new construction vs median sale price

housing_constr = housing_2024
housing_constr = housing_constr %>% 
  mutate(constr_bins = cut(New_Constr_by_share_of_sales_percent, breaks=10))

ggplot(housing_constr, aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

# Perhaps new construction indicates high demand, and as such, the more new construction, the higher the median sale price?

Limit to Entry-Level houses

ggplot(filter(housing_constr,Affordability=="entrylevel"), aes(x = constr_bins, y = Median_Sale_Price_per_k,fill=constr_bins)) +
  geom_boxplot() +
  labs(
    title = "Median Sale Price by Percent New Construction (2024)",
    x = "New Construction by Share of Sales Percent",
    y = "Median Sale Price (in thousands)"
  ) +
  theme_minimal()

# Even more pronounced here